import warnings
warnings.filterwarnings('ignore')
# !pip install mlxtend
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# import os
# print(os.listdir("../input"))
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from lightgbm import LGBMClassifier
from sklearn.preprocessing import Normalizer
# Any results you write to the current directory are saved as output.
# Load the airline-tweet dataset and run a first exploratory pass.
df = pd.read_csv("Tweets.csv")
df.head()
# Show full column contents (no truncation) so complete tweets are visible.
pd.set_option('display.max_colwidth', None)
df.loc[[0]]
print("Shape of the dataframe is", df.shape, "\n")
print("The number of nulls in each column are: \n", df.isna().sum())
print("Percentage null or na values in df")
((df.isnull() | df.isna()).sum() * 100 / df.index.size).round(2)
df.describe()
df.airline.unique()
print("Total number of tweets for each airline \n ", df.groupby('airline')['airline_sentiment'].count().sort_values(ascending=False))
airlines = df.airline.unique().tolist()
# One subplot per airline with its sentiment distribution (2x3 grid).
plt.figure(1, figsize=(12, 12))
for plot_pos, carrier in enumerate(airlines, start=1):
    plt.subplot(2, 3, plot_pos)
    sentiment_counts = df[df['airline'] == carrier]['airline_sentiment'].value_counts()
    # NOTE(review): tick labels assume value_counts() order is negative/neutral/positive — confirm
    positions = [1, 2, 3]
    plt.bar(positions, sentiment_counts, color=['red', 'green', 'blue'])
    plt.xticks(positions, ['negative', 'neutral', 'positive'])
    plt.ylabel('Mood Count')
    plt.xlabel('Mood')
    plt.title('Count of Moods of ' + carrier)
# pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
# Word cloud of the negative tweets, after dropping URLs, @mentions and retweet markers.
negative_tweets = df[df['airline_sentiment'] == 'negative']
all_text = ' '.join(negative_tweets['text'])
kept_tokens = []
for token in all_text.split():
    if 'http' in token or token.startswith('@') or token == 'RT':
        continue
    kept_tokens.append(token)
cleaned_word = " ".join(kept_tokens)
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='black',
                      width=3000,
                      height=2500).generate(cleaned_word)
plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Same word-cloud visualisation for the positive tweets.
# 'cleaned_word' is reused further down (freq()), so that name is kept.
positive_tweets = df[df['airline_sentiment'] == 'positive']
joined_text = ' '.join(positive_tweets['text'])
cleaned_word = " ".join(
    tok for tok in joined_text.split()
    if 'http' not in tok and not tok.startswith('@') and tok != 'RT'
)
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='black',
                      width=3000,
                      height=2500).generate(cleaned_word)
plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Calculate highest frequency words in positive tweets
def freq(str):
    """Print every word in the given text whose frequency exceeds 50.

    Prints 'Frequency of <word> is : <count>' to stdout and returns None.
    The original implementation was O(n^2) (list.count called once per unique
    word); a single counting pass produces identical output in O(n).
    The parameter shadows the builtin 'str' — the name is kept only so the
    call signature stays unchanged.
    """
    counts = {}
    for word in str.split():
        counts[word] = counts.get(word, 0) + 1
    # dicts preserve insertion order, so words are printed in first-occurrence
    # order, exactly like the original's str2 list.
    for word, n in counts.items():
        if n > 50:
            print('Frequency of', word, 'is :', n)
# freq() prints the high-frequency words itself and returns None, so this also prints 'None'
print(freq(cleaned_word))
#get the number of negative reasons
df['negativereason'].nunique()
# NOTE: the original bound NR_Count to a value_counts dict here and immediately
# shadowed it with the function below; that dead assignment has been removed.
def NR_Count(Airline):
    """Return a DataFrame of negative-reason counts for one airline.

    Airline: an airline name from df['airline'], or 'All' to aggregate
    every airline. Returns a frame with columns 'Reasons' and 'count',
    one row per distinct non-NaN negative reason in the full dataset.
    """
    if Airline == 'All':
        subset = df
    else:
        subset = df[df['airline'] == Airline]
    count = dict(subset['negativereason'].value_counts())
    unique_reasons = [x for x in df['negativereason'].unique() if str(x) != 'nan']
    reason_frame = pd.DataFrame({'Reasons': unique_reasons})
    # .get(..., 0): an airline may have no tweets for a given reason; the
    # original count[x] lookup raised KeyError in that case.
    reason_frame['count'] = reason_frame['Reasons'].apply(lambda x: count.get(x, 0))
    return reason_frame
def plot_reason(Airline):
    """Bar-plot the negative-reason counts for the given airline ('All' aggregates every airline)."""
    reasons = NR_Count(Airline)
    positions = range(1, len(reasons) + 1)
    palette = ['red', 'yellow', 'blue', 'green', 'black', 'brown', 'gray', 'cyan', 'purple', 'orange']
    plt.bar(positions, reasons['count'], color=palette)
    plt.xticks(positions, reasons['Reasons'], rotation=90)
    plt.ylabel('Count')
    plt.xlabel('Reason')
    plt.title('Count of Reasons for ' + Airline)
# Negative-reason breakdown for all airlines combined, then one subplot per airline.
plot_reason('All')
plt.figure(2, figsize=(13, 13))
for subplot_pos, carrier in enumerate(airlines, start=1):
    plt.subplot(2, 3, subplot_pos)
    plt.subplots_adjust(hspace=0.9)
    plot_reason(carrier)
Our dataframe has data from 2015-02-17 to 2015-02-24.
It will be interesting to see whether the date has any effect on the sentiment of the tweets (especially the negative ones!). We can draw various conclusions by visualizing this.
# Work on a reset-index copy, then replace df with it.
date = df.reset_index()
#convert the Date column to pandas datetime
date.tweet_created = pd.to_datetime(date.tweet_created)
#Reduce the dates in the date column to only the date and no time stamp using the 'dt.date' method
date.tweet_created = date.tweet_created.dt.date
date.tweet_created.head()
df = date
# Count tweets per (date, airline, sentiment) triple; result is a MultiIndex Series.
day_df = df.groupby(['tweet_created','airline','airline_sentiment']).size()
# day_df = day_df.reset_index()
day_df
This shows the sentiments of tweets for each date from 2015-02-17 to 2015-02-24 for every airline in our dataframe.
Our next step will be to plot this and get better visualization for negative tweets.
# Keep only the 'negative' level of the (date, airline, sentiment) MultiIndex.
day_df = day_df.loc(axis=0)[:,:,'negative']
#groupby and plot data
ax2 = day_df.groupby(['tweet_created','airline']).sum().unstack().plot(kind = 'bar', color=['red', 'green', 'blue','yellow','purple','orange'], figsize = (15,6), rot = 70)
# NOTE(review): these labels assume groupby's alphabetical airline ordering — verify if the airline set changes
labels = ['American','Delta','Southwest','US Airways','United','Virgin America']
ax2.legend(labels = labels)
ax2.set_xlabel('Date')
ax2.set_ylabel('Negative Tweets')
plt.show()
# From here on only the tweet text and its sentiment label are needed.
df = df[['text', 'airline_sentiment']]
print("Shape of the dataframe is",df.shape, "\n")
df.head(5)
#
# Function to remove the contractions
#
import contractions
def replace_contractions(text):
    """Replace contractions in string of text (e.g. "don't" -> "do not") via the 'contractions' library."""
    return contractions.fix(text)
#
# Update the stopwords: keep the negations 'no' and 'not' (removing them would
# flip the meaning of a sentiment), and additionally treat 'pep' as a stopword.
#
from nltk.corpus import stopwords
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stopword_list.append('pep')
print (stopwords.words("english"))
#
# function to remove accented characters
#
import unicodedata
def remove_accented_chars(text):
    """Transliterate accented characters to their closest ASCII form, dropping any with no ASCII equivalent."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
#
# function to remove special characters and optionally digits
#
def remove_special_characters(text, remove_digits=False):
    """Strip all non-alphanumeric characters (and digits too when remove_digits=True).

    Fix: the original patterns used the range 'A-z', which in ASCII also
    matches the punctuation between 'Z' and 'a' ([ \\ ] ^ _ `), so those
    characters slipped through the filter; 'A-Z' is the intended range.
    """
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)
#
# Pre-processing pipeline for a single tweet:
# 1. Remove HTML tags using 'BeautifulSoup'
# 2. Replace contractions using 'contractions' library
# 3. Remove accented characters using unicodedata library
# 4. Remove special characters and digits using regex
# 5. Convert all letters to lowercase
# 6. Remove stopwords (keeping 'no'/'not', dropping 'pep')
# 7. Join the tokens back into a single string
#
# On this cleaned data we will perform stemming and lemmatization
#
from bs4 import BeautifulSoup
def tweet_to_words( raw_review ):
    """Clean one raw tweet into a space-joined string of lowercase, stopword-free words.

    raw_review: the raw tweet text (str).
    Returns the cleaned text as a single space-separated string.
    """
    # 1. Strip HTML tags. The explicit parser avoids BeautifulSoup's
    #    "no parser specified" warning and pins the behavior.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # NOTE: the original also called nltk.word_tokenize(review_text) here and
    # discarded the result before it was ever used; that dead call is removed.
    # 2. Expand contractions (e.g. "can't" -> "cannot").
    review_text = replace_contractions(review_text)
    # 3. Transliterate accented characters to ASCII.
    review_text = remove_accented_chars(review_text)
    # 4. Drop special characters and digits.
    letters_only = remove_special_characters(review_text, remove_digits = True)
    # 5. Lowercase and split into individual words.
    words = letters_only.lower().split()
    # 6. Remove stop words; keep 'no'/'not' (they carry sentiment) and drop 'pep'.
    #    A set makes the membership test O(1) per word.
    stops = stopwords.words("english")
    stops.remove('no')
    stops.remove('not')
    stops.append('pep')
    stops = set(stops)
    meaningful_words = [w for w in words if not w in stops]
    # 7. Join the words back into one string separated by space.
    return( " ".join( meaningful_words ))
#
# Clean the 'text' column into a new column 'clean_tweet'
# (row-wise apply; tweet_to_words handles one tweet at a time)
#
df['clean_tweet'] = df['text'].apply(lambda x: tweet_to_words(x))
#
# check the cleaned tweet in 'clean_tweet'
#
df.head(5)
#
# Functions to perform stemming
#
from nltk.stem import LancasterStemmer, WordNetLemmatizer
def stem_words(words):
    """Return the Lancaster stem of every token in *words* (list in, list out)."""
    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]
def simple_stemmer(text):
    """Porter-stem each whitespace-separated word of *text* and re-join with single spaces."""
    porter = nltk.porter.PorterStemmer()
    stemmed_tokens = [porter.stem(token) for token in text.split()]
    return ' '.join(stemmed_tokens)
#
# Functions to perform lemmatization
#
def lemmatize_verbs(words):
    """Return each token of *words* lemmatized as a verb via WordNet (list in, list out)."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in words]
import spacy
# NOTE(review): the parse/tag/entity keyword arguments look spaCy v1-era;
# modern spaCy (v2/v3) does not accept them — confirm the installed version.
nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
def lemmatize_text(text):
    # Run the spaCy pipeline; the Doc is iterable over tokens.
    text = nlp(text) # encode to spacy format
    # '-PRON-' is spaCy v2's lemma placeholder for pronouns (not proper nouns);
    # keep the original token text in that case instead of the placeholder.
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
#
# Single entry points that return both the stemmed and the lemmatized form.
#
def stem_and_lemmatize(words):
    """Return (stems, lemmas) for a token list: Lancaster stems and WordNet verb lemmas."""
    return stem_words(words), lemmatize_verbs(words)
def simple_stem_and_lemmatize(words):
    """Return (stems, lemmas) for a text string: Porter-stemmed text and spaCy-lemmatized text."""
    return simple_stemmer(words), lemmatize_text(words)
#
# Create columns for stemmed and lemmatized text. Stemming and Lemmatization are performed over the 'clean_tweet' text
#
# Vectorised with .apply instead of the original row-by-row iterrows/df.at loop:
# the resulting columns are identical ('lemma' created first to keep the same
# column order), and this is both faster and more idiomatic pandas.
df['lemma'] = df['clean_tweet'].apply(lemmatize_text)
df['stem'] = df['clean_tweet'].apply(simple_stemmer)
#
# check the stemmed and lemmatized text
#
df.head()
#
# The data is split in the standard 80,20 ratio
# (stratify keeps the negative/neutral/positive class proportions identical
# in the train and test sets; random_state makes the split reproducible)
#
train,test = train_test_split(df,test_size=0.2,random_state=42, stratify=df.airline_sentiment)
#
# Generate test and train data from the cleaned and 'stem' text
#
# train_clean_tweet_stem=[]
# for tweet in train['stem']:
# train_clean_tweet_stem.append(tweet)
# test_clean_tweet_stem=[]
# for tweet in test['stem']:
# test_clean_tweet_stem.append(tweet)
# #
# # Generate test and train data from the cleaned and 'lemma' text
# #
# train_clean_tweet_lemma=[]
# for tweet in train['lemma']:
# train_clean_tweet_lemma.append(tweet)
# test_clean_tweet_lemma=[]
# for tweet in test['lemma']:
# test_clean_tweet_lemma.append(tweet)
from sklearn.feature_extraction.text import CountVectorizer
# print ("Creating CountVectorized bag of words...\n")
# # Initialize the "CountVectorizer" object, which is scikit-learn's
# # bag of words tool.
# v_cv = CountVectorizer(analyzer = "word")
# train_features_cv = v_cv.fit_transform(train_clean_tweet_stem)
# test_features_cv = v_cv.transform(test_clean_tweet_stem)
# # summarize v_cd
# # print(v_cv.vocabulary_)
# print(train_features_cv.shape)
# # print(type(train_features_cv))
# # print(train_features_cv.toarray())
from sklearn.feature_extraction.text import TfidfVectorizer
# # create the transform
# v_tfidf = TfidfVectorizer()
# # tokenize and build vocab
# train_features_tfidf = v_tfidf.fit_transform(train_clean_tweet_stem)
# test_features_tfidf = v_tfidf.transform(test_clean_tweet_stem)
# print(test_features_tfidf.shape)
# Results table: one row per (classifier, corpus column, vectorizer) with its test accuracy.
df_result_mod_train = pd.DataFrame({'Classifier':[],'processed_data':[],'Vectorizer':[], 'Accuracy':[]})
# Running row index into df_result_mod_train; incremented by fit_classifier.
ind = 0
# Candidate models benchmarked on every corpus/vectorizer combination.
Classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_jobs=-1,n_estimators=500),
    GradientBoostingClassifier(n_estimators=500),
    LogisticRegression(max_iter=500),
    SVC(),
    KNeighborsClassifier(n_neighbors=3),
    AdaBoostClassifier(),
    GaussianNB()]
from yellowbrick.classifier import ClassificationReport, ROCAUC
def visClassifierResults(model_w_parameters, X_train, y_train, X_test, y_test):
    """Render a yellowbrick classification report, then a ROC-AUC curve, for the given model."""
    for visualizer_cls in (ClassificationReport, ROCAUC):
        visualizer = visualizer_cls(model_w_parameters)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.show()
def get_classification_report(y_test, y_pred):
    """Return sklearn's classification report as a DataFrame sorted by f1-score (descending).

    Fix: the original called metrics.classification_report, but 'metrics' is
    never imported anywhere in this file — calling this function raised
    NameError. classification_report is imported directly from sklearn.metrics
    at the top of the file, so it is used by name here.
    """
    report = classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    return df_classification_report.sort_values(by=['f1-score'], ascending=False)
def fit_classifier(train_data, test_data, col_name, vectorizer_name):
    """Train every model in Classifiers on train_data and report against the test set.

    For each model: prints the accuracy and classification report, appends a
    row to the global df_result_mod_train table, plots the confusion matrix,
    and renders the yellowbrick visualisations.

    Fix vs the original: classification_report and confusion_matrix take
    (y_true, y_pred) in that order; the original passed (pred, y_true), which
    swapped precision/recall in the report and transposed the confusion matrix.
    """
    global ind
    # Dense copies for estimators (e.g. GaussianNB) that reject sparse matrices.
    dense_features = train_data.toarray()
    dense_test = test_data.toarray()
    Accuracy = []
    Model = []
    y_true = test['airline_sentiment']
    for classifier in Classifiers:
        try:
            fit = classifier.fit(train_data, train['airline_sentiment'])
            pred = fit.predict(test_data)
        except Exception:
            # Fall back to dense input when the model cannot handle sparse data.
            fit = classifier.fit(dense_features, train['airline_sentiment'])
            pred = fit.predict(dense_test)
        accuracy = accuracy_score(y_true, pred)
        Accuracy.append(accuracy)
        Model.append(classifier.__class__.__name__)
        print(classifier.__class__.__name__+' on cleaned '+col_name+' text with vectorizer: '+vectorizer_name)
        print('==============================================================================================')
        print('Accuracy of '+classifier.__class__.__name__+' on cleaned '+col_name+' text with vectorizer: '+vectorizer_name+' is '+str(accuracy))
        report = classification_report(y_true, pred)
        print(report)
        df_result_mod_train.loc[ind] = [classifier.__class__.__name__,
                                        col_name,
                                        vectorizer_name,
                                        accuracy]
        ind = ind + 1
        cm = confusion_matrix(y_true, pred)
        plt.figure()
        plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Reds)
        plt.xticks(range(3), ['Negative', 'Neutral', 'Positive'], fontsize=16, color='black')
        plt.yticks(range(3), ['Negative', 'Neutral', 'Positive'], fontsize=16)
        plt.show()
        visClassifierResults(classifier, dense_features, train['airline_sentiment'], dense_test, test['airline_sentiment'])
def process_column(col):
    """Vectorize the train/test text of df column *col* and benchmark all classifiers.

    Builds a bag-of-words (CountVectorizer) and a TF-IDF (TfidfVectorizer)
    document-term matrix from the column, then runs fit_classifier on each.
    """
    # .tolist() replaces the original element-by-element append loops.
    train_clean_tweet = train[col].tolist()
    test_clean_tweet = test[col].tolist()
    print ("Creating CountVectorized bag of words...\n")
    v_cv = CountVectorizer(analyzer = "word")
    train_features_cv = v_cv.fit_transform(train_clean_tweet)
    # transform (not fit_transform) so the test set uses the training vocabulary.
    test_features_cv = v_cv.transform(test_clean_tweet)
    print("CountVectorized shape: ", train_features_cv.shape)
    # Fit the DTM generated using CountVectorizer
    fit_classifier(train_features_cv, test_features_cv, col, 'CountVectorized')
    v_tfidf = TfidfVectorizer()
    # tokenize and build vocab
    train_features_tfidf = v_tfidf.fit_transform(train_clean_tweet)
    test_features_tfidf = v_tfidf.transform(test_clean_tweet)
    print("TfIDF shape: ", test_features_tfidf.shape)
    # Fit the DTM generated using TF-IDF
    fit_classifier(train_features_tfidf, test_features_tfidf, col, 'TfidfVectorizer')
# Fit the DTM generated using 'stem' cleaned corpus on both vectorizer
process_column('stem')
# NOTE: sort_values returns a new frame; the result is only displayed (notebook-style), not stored
df_result_mod_train.sort_values(by='Accuracy', ascending=False, ignore_index=True)
# Fit the DTM generated using 'lemma' cleaned corpus on both vectorizer
process_column('lemma')
# process_column('clean_tweet')
df_result_mod_train.sort_values(by='Accuracy', ascending=False, ignore_index=True)
# process_column('text')
# Now we will use the 'CountVectorizer' with 'LGBMClassifier'
# pip3 install lightgbm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import CountVectorizer
# Pipeline: counts -> TF-IDF -> L2 normalisation -> LSA (100 components) -> LightGBM.
# Fix: the original nested one CountVectorizer inside another
# (CountVectorizer(CountVectorizer(stop_words='english'))), which passed the
# inner vectorizer as the 'input' argument and silently discarded the
# stop_words setting; a single vectorizer with stop_words='english' is intended.
twitter_sentiment = Pipeline([('CVec', CountVectorizer(stop_words='english')),
                              ('Tfidf', TfidfTransformer()),
                              ('norm', Normalizer()),
                              ('tSVD', TruncatedSVD(n_components=100)),
                              ('lgb', LGBMClassifier(n_jobs=-1))])
from sklearn.model_selection import cross_validate
def run_cv(col):
    # 5-fold cross-validation of the full pipeline on df[col], scored with
    # one-vs-rest ROC AUC (the target has three sentiment classes).
    cv_pred = cross_validate(twitter_sentiment,
                             df[col],
                             df['airline_sentiment'],
                             cv=5,
                             scoring=('roc_auc_ovr'))
    print ("Score on column '" , col, "' on each fold: ", cv_pred['test_score'])
# Fix: '%%time' is an IPython cell magic and a syntax error in a plain .py
# script; the same timing information is produced with time.perf_counter.
import time
for _col in ('clean_tweet', 'stem', 'lemma'):
    _t0 = time.perf_counter()
    run_cv(_col)
    print("run_cv(%r) wall time: %.2f s" % (_col, time.perf_counter() - _t0))
# Show every (classifier, corpus, vectorizer) run ranked by test accuracy
df_result_mod_train.sort_values(by='Accuracy', ascending=False, ignore_index=True)
Remove HTML/XML tags: This removes all markup tags, which are specific to a markup language and provide no information when the model is trained and analyzed, producing a tag-free corpus. Data may also be fetched from sources that contain other kinds of tags, and it is recommended to remove those as well.
Tokenize using 'NLTK': Tokenization is a way of separating a piece of text into smaller units called tokens. Here, tokens can be either words, characters, or subwords. Hence, tokenization can be broadly classified into 3 types – word, character, and subword (n-gram characters) tokenization. As tokens are the building blocks of Natural Language, the most common way of processing the raw text happens at the token level.
Replace contractions using the 'contractions' library: This expands contractions into their separate words to keep the corpus consistent.
Remove accented characters using the unicodedata library: Certain languages have accented characters that would be inconsistent with the other text in the corpus. NLP pre-processing libraries allow removing the accented characters and converting the words to ASCII or UTF-8.
Remove special characters and digits using regex: It is also advisable to keep only displayable and language interpretable words to allow model to perform better. In this example, we also removed the digits since it was not helping to capture the sentiments.
Convert all letters to lowercase: An uppercase and a lowercase letter are two different characters, and the model/machine would treat them as separate words. It is better to convert all words to a single case to minimize the number of features and keep the same word, whatever its casing, as a single feature. In many texts, a word may be written entirely in uppercase to stress or emphasize an emotion; for sentiment analysis it is usually not necessary to preserve such emphasis, so we converted all words to lowercase.
Remove stopwords: Stop words are a set of commonly used words in a language. Examples of stop words in a language are “a”, “the”, “is”, “are” and etc. The intuition behind using stop words is that, by removing low information words from text, we can focus on the important words instead.
Stemming: Stemming is the process of reducing inflected words (e.g. troubled, troubles) to their root form (e.g. trouble). The "root" in this case may not be a real root word, but just a canonical form of the original word. By stemming, we reduce the number of features and therefore avoid the curse of dimensionality without compromising on the information.
Lemmatization: Lemmatization on the surface is very similar to stemming, where the goal is to remove inflections and map a word to its root form. The only difference is that, lemmatization tries to do it the proper way. It doesn’t just chop things off, it actually transforms words to the actual root.
We have the accuracy scores of all the models in the table with cleaned 'stem' and 'lemma' corpus.
Accuracy of LogisticRegression on cleaned stem text with vectorizer: CountVectorized is 0.7961065573770492 precision recall f1-score support
negative 0.90 0.86 0.88 1909
neutral 0.62 0.62 0.62 613
positive 0.64 0.75 0.69 406
Accuracy of LogisticRegression on cleaned stem text with vectorizer: TfidfVectorizer is 0.7988387978142076 precision recall f1-score support
negative 0.94 0.83 0.88 2065
neutral 0.56 0.66 0.61 520
positive 0.58 0.80 0.67 343
We also provided the output of TP, TN, FP and FN against the multi-class sentiment output, which can then be used to analyze each model's performance. The accuracy gives the overall efficiency of a model, and the best-performing models were LogisticRegression, SVC and LGBMClassifier. With more text and tweets, we could improve the models' performance further.